Space

data <- read.csv('/Users/alexlange/Downloads/DATS_Intro_to_Data_Science/general_sample_result_fixed.csv', header = TRUE, sep = ",")
test <- data$r_med_geo * sin(data$pm/1000*pi/648000) * 978462
plot3d(x=data$ra, y=data$dec, z=data$r_med_geo,col = test, type = 'p', radius = data$parallax/max(data$parallax),xlab="RA", ylab="DEC", zlab="DIST",alpha=.3)

Space

open3d()                                   # create new plot
## null 
##    4
spheres3d(x = 1, y = 1, z = 1, radius = .1) # produce sphere
axes3d()   

#Space

#plot3d(

library(SPADAR) #taken from https://search.r-project.org/CRAN/refmans/SPADAR/html/SPADAR-package.html
createAllSkyScatterPlotChart(data$ra, data$dec, mainGrid="galactic",
                             dataCoordSys="equatorial", pointcol=test, pch=19, 
                             pointsize=data$parallax/max(data$parallax),
                             eqDraw=TRUE, eclDraw=FALSE, galDraq=TRUE, galCol="black", eqLty=2, galLty=1, 
                             eqCol=rgb(1,0,0,0.5))

data <- read.csv('/Users/alexlange/Downloads/DATS_Intro_to_Data_Science/general_sample_result_fixed.csv', header = TRUE, sep = ",")
corr_data <- data[,c("ra","dec","parallax","pm","pmra","pmdec","teff_gspphot","logg_gspphot","mh_gspphot","alphafe_gspspec","fem_gspspec","sife_gspspec","cafe_gspspec","feiim_gspspec")]
pairs(corr_data,upper.panel = NULL)

data.cor = cor(corr_data,use="pairwise.complete.obs")
corrplot(data.cor,method="ellipse",type="lower",diag = FALSE,tl.col="black")

data.sig = cor.mtest(data.cor, conf.level= .95)
corrplot(data.cor,
         method = "number",
         type = "lower",
         diag = FALSE,
         tl.col = "black",
         tl.srt = 45,
         p.mat = data.sig$p,
         sig.level = .1,
         na.label=" ")

sample<-read_csv("general_sample_result.csv")
high<-read_csv("v_tan_500km_s-result.csv")
sample$v_tan <- sample$r_med_geo * sin(sample$pm/1000*pi/648000) * 978462
high$v_tan <- high$r_med_geo * sin(high$pm/1000*pi/648000) * 978462
high$Gmag_abs <- high$phot_g_mean_mag - 5 * log10(high$r_med_geo) + 5
sample$Gmag_abs <- sample$phot_g_mean_mag - 5 * log10(sample$r_med_geo) + 5
high$RPmag_abs <- high$phot_rp_mean_mag - 5 * log10(high$r_med_geo) + 5
sample$RPmag_abs <- sample$phot_rp_mean_mag - 5 * log10(sample$r_med_geo) + 5
high$BPmag_abs <- high$phot_bp_mean_mag - 5 * log10(high$r_med_geo) + 5
sample$BPmag_abs <- sample$phot_bp_mean_mag - 5 * log10(sample$r_med_geo) + 5
data1<-subset(sample,select = c(v_tan,Gmag_abs,RPmag_abs,BPmag_abs,bp_rp,bp_g,g_rp))
data2<-subset(high,select = c(v_tan,Gmag_abs,RPmag_abs,BPmag_abs,bp_rp,bp_g,g_rp))
data1$group<-"general"
data2$group<-"high"
data1<-na.omit(data1)
data2<-na.omit(data2)
data3<-rbind(data1, data2)
ggplot(data3, aes(x=v_tan, color=group, fill=group)) +
geom_histogram(aes(y=..density..), position="identity", alpha=0.5)+
geom_density(alpha=0.6)+
labs(title="velocity plot",x="velocity", y = "Density")+
theme_classic()

ggplot(data3, aes(x=Gmag_abs, color=group, fill=group)) +
geom_histogram(aes(y=..density..), position="identity", alpha=0.5)+geom_density(alpha=.01,) +
labs(title="magnitude(green) plot",x="magnitude", y = "Density")+
theme_classic()

ggplot(data3, aes(x=RPmag_abs, color=group, fill=group)) +
geom_histogram(aes(y=..density..), position="identity", alpha=0.5)+geom_density(alpha=.01,) +
labs(title="magnitude(red) plot",x="magnitude", y = "Density")+
theme_classic()

ggplot(data3, aes(x=BPmag_abs, color=group, fill=group)) +
geom_histogram(aes(y=..density..), position="identity", alpha=0.5)+geom_density(alpha=.01,) +
labs(title="magnitude(blue) plot",x="magnitude", y = "Density")+
theme_classic()

ggplot(data3, aes(x=bp_rp, color=group, fill=group)) +
geom_histogram(aes(y=..density..), position="identity", alpha=0.5)+geom_density(alpha=.01,) +
labs(title="color(blue-red) plot",x="Difference", y = "Density")+
theme_classic()

ggplot(data3, aes(x=bp_g, color=group, fill=group)) +
geom_histogram(aes(y=..density..), position="identity", alpha=0.5)+geom_density(alpha=.01,) +
labs(title="color(blue-green) plot",x="Difference", y = "Density")+
theme_classic()

ggplot(data3, aes(x=g_rp, color=group, fill=group)) +
geom_histogram(aes(y=..density..), position="identity", alpha=0.5)+geom_density(alpha=.01,) +
labs(title="color(green-red) plot",x="Difference", y = "Density")+
theme_classic()

variance Analysis(Bartlett’s test)

bartlett.test(list(data1$v_tan,data2$v_tan))
## 
##  Bartlett test of homogeneity of variances
## 
## data:  list(data1$v_tan, data2$v_tan)
## Bartlett's K-squared = 6475, df = 1, p-value <2e-16

Brown-Forsythe’s test

library(onewaytests)
bf.test(v_tan ~ group, data = data3)
## 
##   Brown-Forsythe Test (alpha = 0.05) 
## ------------------------------------------------------------- 
##   data : v_tan and group 
## 
##   statistic  : 191646 
##   num df     : 1 
##   denom df   : 5592 
##   p.value    : 0 
## 
##   Result     : Difference is statistically significant. 
## -------------------------------------------------------------

According to the graph this data is not normal distribution. according to the bartlett test and Brown-Forsythe’s test, variances of this two variables are heterogeneous. Therfore we can not use normal anova analysis tools like t-test or f test. wlech test

oneway.test(v_tan ~ group, data = data3, var.equal = FALSE)
## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  v_tan and group
## F = 2e+05, num df = 1, denom df = 5592, p-value <2e-16

We choose Kruskal-Wallis Test This test is the nonparametric equivalent of the one-way ANOVA and is typically used when the normality assumption is violated.

Kruskal-Wallis Test

kruskal.test(v_tan ~ group, data = data3)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  v_tan by group
## Kruskal-Wallis chi-squared = 9698, df = 1, p-value <2e-16

Both test shows that we can reject null hypothesis, there are siginficant different between high group and general group

for other varibale

bf.test(Gmag_abs ~ group, data = data3)
## 
##   Brown-Forsythe Test (alpha = 0.05) 
## ------------------------------------------------------------- 
##   data : Gmag_abs and group 
## 
##   statistic  : 269 
##   num df     : 1 
##   denom df   : 13609 
##   p.value    : 6.96e-60 
## 
##   Result     : Difference is statistically significant. 
## -------------------------------------------------------------
bf.test(RPmag_abs ~ group, data = data3)
## 
##   Brown-Forsythe Test (alpha = 0.05) 
## ------------------------------------------------------------- 
##   data : RPmag_abs and group 
## 
##   statistic  : 370 
##   num df     : 1 
##   denom df   : 13657 
##   p.value    : 2e-81 
## 
##   Result     : Difference is statistically significant. 
## -------------------------------------------------------------
bf.test(BPmag_abs ~ group, data = data3)
## 
##   Brown-Forsythe Test (alpha = 0.05) 
## ------------------------------------------------------------- 
##   data : BPmag_abs and group 
## 
##   statistic  : 188 
##   num df     : 1 
##   denom df   : 13589 
##   p.value    : 1.75e-42 
## 
##   Result     : Difference is statistically significant. 
## -------------------------------------------------------------
bf.test(bp_rp ~ group, data = data3)
## 
##   Brown-Forsythe Test (alpha = 0.05) 
## ------------------------------------------------------------- 
##   data : bp_rp and group 
## 
##   statistic  : 119 
##   num df     : 1 
##   denom df   : 13512 
##   p.value    : 1.11e-27 
## 
##   Result     : Difference is statistically significant. 
## -------------------------------------------------------------
bf.test(bp_g ~ group, data = data3)
## 
##   Brown-Forsythe Test (alpha = 0.05) 
## ------------------------------------------------------------- 
##   data : bp_g and group 
## 
##   statistic  : 69.7 
##   num df     : 1 
##   denom df   : 13801 
##   p.value    : 7.38e-17 
## 
##   Result     : Difference is statistically significant. 
## -------------------------------------------------------------
bf.test(g_rp ~ group, data = data3)
## 
##   Brown-Forsythe Test (alpha = 0.05) 
## ------------------------------------------------------------- 
##   data : g_rp and group 
## 
##   statistic  : 111 
##   num df     : 1 
##   denom df   : 12986 
##   p.value    : 9.03e-26 
## 
##   Result     : Difference is statistically significant. 
## -------------------------------------------------------------
kruskal.test(Gmag_abs ~ group, data = data3)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Gmag_abs by group
## Kruskal-Wallis chi-squared = 111, df = 1, p-value <2e-16
kruskal.test(RPmag_abs ~ group, data = data3)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  RPmag_abs by group
## Kruskal-Wallis chi-squared = 213, df = 1, p-value <2e-16
kruskal.test(BPmag_abs ~ group, data = data3)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  BPmag_abs by group
## Kruskal-Wallis chi-squared = 31, df = 1, p-value = 2e-08
kruskal.test(bp_rp ~ group, data = data3)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  bp_rp by group
## Kruskal-Wallis chi-squared = 149, df = 1, p-value <2e-16
kruskal.test(bp_g ~ group, data = data3)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  bp_g by group
## Kruskal-Wallis chi-squared = 57, df = 1, p-value = 5e-14
kruskal.test(g_rp ~ group, data = data3)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  g_rp by group
## Kruskal-Wallis chi-squared = 212, df = 1, p-value <2e-16
df_general <- read.csv("general_sample_result.csv")
df_general$sample <- "General Sample"
df_high <- read.csv("v_tan_500km_s-result.csv")
df_high$sample <- "High Velocity Sample"
df <- rbind(df_general, df_high)

# calculate tangential velocity r_med_geo*sin(pm/1000*PI()/648000) * 978462
df$v_tan <- df$r_med_geo * sin(df$pm/1000*pi/648000) * 978462

# calculate absolute magnitudes from r_med_geo distance
df$Gmag_abs <- df$phot_g_mean_mag - 5 * log10(df$r_med_geo) + 5
df$RPmag_abs <- df$phot_rp_mean_mag - 5 * log10(df$r_med_geo) + 5
df$BPmag_abs <- df$phot_bp_mean_mag - 5 * log10(df$r_med_geo) + 5
# columns of data
colnames(df)
##   [1] "solution_id"                      "designation"                     
##   [3] "source_id"                        "random_index"                    
##   [5] "ref_epoch"                        "ra"                              
##   [7] "ra_error"                         "dec"                             
##   [9] "dec_error"                        "parallax"                        
##  [11] "parallax_error"                   "parallax_over_error"             
##  [13] "pm"                               "pmra"                            
##  [15] "pmra_error"                       "pmdec"                           
##  [17] "pmdec_error"                      "ra_dec_corr"                     
##  [19] "ra_parallax_corr"                 "ra_pmra_corr"                    
##  [21] "ra_pmdec_corr"                    "dec_parallax_corr"               
##  [23] "dec_pmra_corr"                    "dec_pmdec_corr"                  
##  [25] "parallax_pmra_corr"               "parallax_pmdec_corr"             
##  [27] "pmra_pmdec_corr"                  "astrometric_n_obs_al"            
##  [29] "astrometric_n_obs_ac"             "astrometric_n_good_obs_al"       
##  [31] "astrometric_n_bad_obs_al"         "astrometric_gof_al"              
##  [33] "astrometric_chi2_al"              "astrometric_excess_noise"        
##  [35] "astrometric_excess_noise_sig"     "astrometric_params_solved"       
##  [37] "astrometric_primary_flag"         "nu_eff_used_in_astrometry"       
##  [39] "pseudocolour"                     "pseudocolour_error"              
##  [41] "ra_pseudocolour_corr"             "dec_pseudocolour_corr"           
##  [43] "parallax_pseudocolour_corr"       "pmra_pseudocolour_corr"          
##  [45] "pmdec_pseudocolour_corr"          "astrometric_matched_transits"    
##  [47] "visibility_periods_used"          "astrometric_sigma5d_max"         
##  [49] "matched_transits"                 "new_matched_transits"            
##  [51] "matched_transits_removed"         "ipd_gof_harmonic_amplitude"      
##  [53] "ipd_gof_harmonic_phase"           "ipd_frac_multi_peak"             
##  [55] "ipd_frac_odd_win"                 "ruwe"                            
##  [57] "scan_direction_strength_k1"       "scan_direction_strength_k2"      
##  [59] "scan_direction_strength_k3"       "scan_direction_strength_k4"      
##  [61] "scan_direction_mean_k1"           "scan_direction_mean_k2"          
##  [63] "scan_direction_mean_k3"           "scan_direction_mean_k4"          
##  [65] "duplicated_source"                "phot_g_n_obs"                    
##  [67] "phot_g_mean_flux"                 "phot_g_mean_flux_error"          
##  [69] "phot_g_mean_flux_over_error"      "phot_g_mean_mag"                 
##  [71] "phot_bp_n_obs"                    "phot_bp_mean_flux"               
##  [73] "phot_bp_mean_flux_error"          "phot_bp_mean_flux_over_error"    
##  [75] "phot_bp_mean_mag"                 "phot_rp_n_obs"                   
##  [77] "phot_rp_mean_flux"                "phot_rp_mean_flux_error"         
##  [79] "phot_rp_mean_flux_over_error"     "phot_rp_mean_mag"                
##  [81] "phot_bp_rp_excess_factor"         "phot_bp_n_contaminated_transits" 
##  [83] "phot_bp_n_blended_transits"       "phot_rp_n_contaminated_transits" 
##  [85] "phot_rp_n_blended_transits"       "phot_proc_mode"                  
##  [87] "bp_rp"                            "bp_g"                            
##  [89] "g_rp"                             "radial_velocity"                 
##  [91] "radial_velocity_error"            "rv_method_used"                  
##  [93] "rv_nb_transits"                   "rv_nb_deblended_transits"        
##  [95] "rv_visibility_periods_used"       "rv_expected_sig_to_noise"        
##  [97] "rv_renormalised_gof"              "rv_chisq_pvalue"                 
##  [99] "rv_time_duration"                 "rv_amplitude_robust"             
## [101] "rv_template_teff"                 "rv_template_logg"                
## [103] "rv_template_fe_h"                 "rv_atm_param_origin"             
## [105] "vbroad"                           "vbroad_error"                    
## [107] "vbroad_nb_transits"               "grvs_mag"                        
## [109] "grvs_mag_error"                   "grvs_mag_nb_transits"            
## [111] "rvs_spec_sig_to_noise"            "phot_variable_flag"              
## [113] "l"                                "b"                               
## [115] "ecl_lon"                          "ecl_lat"                         
## [117] "in_qso_candidates"                "in_galaxy_candidates"            
## [119] "non_single_star"                  "has_xp_continuous"               
## [121] "has_xp_sampled"                   "has_rvs"                         
## [123] "has_epoch_photometry"             "has_epoch_rv"                    
## [125] "has_mcmc_gspphot"                 "has_mcmc_msc"                    
## [127] "in_andromeda_survey"              "classprob_dsc_combmod_quasar"    
## [129] "classprob_dsc_combmod_galaxy"     "classprob_dsc_combmod_star"      
## [131] "teff_gspphot"                     "teff_gspphot_lower"              
## [133] "teff_gspphot_upper"               "logg_gspphot"                    
## [135] "logg_gspphot_lower"               "logg_gspphot_upper"              
## [137] "mh_gspphot"                       "mh_gspphot_lower"                
## [139] "mh_gspphot_upper"                 "distance_gspphot"                
## [141] "distance_gspphot_lower"           "distance_gspphot_upper"          
## [143] "azero_gspphot"                    "azero_gspphot_lower"             
## [145] "azero_gspphot_upper"              "ag_gspphot"                      
## [147] "ag_gspphot_lower"                 "ag_gspphot_upper"                
## [149] "ebpminrp_gspphot"                 "ebpminrp_gspphot_lower"          
## [151] "ebpminrp_gspphot_upper"           "libname_gspphot"                 
## [153] "solution_id.1"                    "source_id.1"                     
## [155] "classprob_dsc_combmod_quasar.1"   "classprob_dsc_combmod_galaxy.1"  
## [157] "classprob_dsc_combmod_star.1"     "classprob_dsc_combmod_whitedwarf"
## [159] "classprob_dsc_combmod_binarystar" "classprob_dsc_specmod_quasar"    
## [161] "classprob_dsc_specmod_galaxy"     "classprob_dsc_specmod_star"      
## [163] "classprob_dsc_specmod_whitedwarf" "classprob_dsc_specmod_binarystar"
## [165] "classprob_dsc_allosmod_quasar"    "classprob_dsc_allosmod_galaxy"   
## [167] "classprob_dsc_allosmod_star"      "teff_gspphot.1"                  
## [169] "teff_gspphot_lower.1"             "teff_gspphot_upper.1"            
## [171] "logg_gspphot.1"                   "logg_gspphot_lower.1"            
## [173] "logg_gspphot_upper.1"             "mh_gspphot.1"                    
## [175] "mh_gspphot_lower.1"               "mh_gspphot_upper.1"              
## [177] "distance_gspphot.1"               "distance_gspphot_lower.1"        
## [179] "distance_gspphot_upper.1"         "azero_gspphot.1"                 
## [181] "azero_gspphot_lower.1"            "azero_gspphot_upper.1"           
## [183] "ag_gspphot.1"                     "ag_gspphot_lower.1"              
## [185] "ag_gspphot_upper.1"               "abp_gspphot"                     
## [187] "abp_gspphot_lower"                "abp_gspphot_upper"               
## [189] "arp_gspphot"                      "arp_gspphot_lower"               
## [191] "arp_gspphot_upper"                "ebpminrp_gspphot.1"              
## [193] "ebpminrp_gspphot_lower.1"         "ebpminrp_gspphot_upper.1"        
## [195] "mg_gspphot"                       "mg_gspphot_lower"                
## [197] "mg_gspphot_upper"                 "radius_gspphot"                  
## [199] "radius_gspphot_lower"             "radius_gspphot_upper"            
## [201] "logposterior_gspphot"             "mcmcaccept_gspphot"              
## [203] "libname_gspphot.1"                "teff_gspspec"                    
## [205] "teff_gspspec_lower"               "teff_gspspec_upper"              
## [207] "logg_gspspec"                     "logg_gspspec_lower"              
## [209] "logg_gspspec_upper"               "mh_gspspec"                      
## [211] "mh_gspspec_lower"                 "mh_gspspec_upper"                
## [213] "alphafe_gspspec"                  "alphafe_gspspec_lower"           
## [215] "alphafe_gspspec_upper"            "fem_gspspec"                     
## [217] "fem_gspspec_lower"                "fem_gspspec_upper"               
## [219] "fem_gspspec_nlines"               "fem_gspspec_linescatter"         
## [221] "sife_gspspec"                     "sife_gspspec_lower"              
## [223] "sife_gspspec_upper"               "sife_gspspec_nlines"             
## [225] "sife_gspspec_linescatter"         "cafe_gspspec"                    
## [227] "cafe_gspspec_lower"               "cafe_gspspec_upper"              
## [229] "cafe_gspspec_nlines"              "cafe_gspspec_linescatter"        
## [231] "tife_gspspec"                     "tife_gspspec_lower"              
## [233] "tife_gspspec_upper"               "tife_gspspec_nlines"             
## [235] "tife_gspspec_linescatter"         "mgfe_gspspec"                    
## [237] "mgfe_gspspec_lower"               "mgfe_gspspec_upper"              
## [239] "mgfe_gspspec_nlines"              "mgfe_gspspec_linescatter"        
## [241] "ndfe_gspspec"                     "ndfe_gspspec_lower"              
## [243] "ndfe_gspspec_upper"               "ndfe_gspspec_nlines"             
## [245] "ndfe_gspspec_linescatter"         "feiim_gspspec"                   
## [247] "feiim_gspspec_lower"              "feiim_gspspec_upper"             
## [249] "feiim_gspspec_nlines"             "feiim_gspspec_linescatter"       
## [251] "sfe_gspspec"                      "sfe_gspspec_lower"               
## [253] "sfe_gspspec_upper"                "sfe_gspspec_nlines"              
## [255] "sfe_gspspec_linescatter"          "zrfe_gspspec"                    
## [257] "zrfe_gspspec_lower"               "zrfe_gspspec_upper"              
## [259] "zrfe_gspspec_nlines"              "zrfe_gspspec_linescatter"        
## [261] "nfe_gspspec"                      "nfe_gspspec_lower"               
## [263] "nfe_gspspec_upper"                "nfe_gspspec_nlines"              
## [265] "nfe_gspspec_linescatter"          "crfe_gspspec"                    
## [267] "crfe_gspspec_lower"               "crfe_gspspec_upper"              
## [269] "crfe_gspspec_nlines"              "crfe_gspspec_linescatter"        
## [271] "cefe_gspspec"                     "cefe_gspspec_lower"              
## [273] "cefe_gspspec_upper"               "cefe_gspspec_nlines"             
## [275] "cefe_gspspec_linescatter"         "nife_gspspec"                    
## [277] "nife_gspspec_lower"               "nife_gspspec_upper"              
## [279] "nife_gspspec_nlines"              "nife_gspspec_linescatter"        
## [281] "cn0ew_gspspec"                    "cn0ew_gspspec_uncertainty"       
## [283] "cn0_gspspec_centralline"          "cn0_gspspec_width"               
## [285] "dib_gspspec_lambda"               "dib_gspspec_lambda_uncertainty"  
## [287] "dibew_gspspec"                    "dibew_gspspec_uncertainty"       
## [289] "dibewnoise_gspspec_uncertainty"   "dibp0_gspspec"                   
## [291] "dibp2_gspspec"                    "dibp2_gspspec_uncertainty"       
## [293] "dibqf_gspspec"                    "flags_gspspec"                   
## [295] "logchisq_gspspec"                 "ew_espels_halpha"                
## [297] "ew_espels_halpha_uncertainty"     "ew_espels_halpha_flag"           
## [299] "ew_espels_halpha_model"           "classlabel_espels"               
## [301] "classlabel_espels_flag"           "classprob_espels_wcstar"         
## [303] "classprob_espels_wnstar"          "classprob_espels_bestar"         
## [305] "classprob_espels_ttauristar"      "classprob_espels_herbigstar"     
## [307] "classprob_espels_dmestar"         "classprob_espels_pne"            
## [309] "azero_esphs"                      "azero_esphs_uncertainty"         
## [311] "ag_esphs"                         "ag_esphs_uncertainty"            
## [313] "ebpminrp_esphs"                   "ebpminrp_esphs_uncertainty"      
## [315] "teff_esphs"                       "teff_esphs_uncertainty"          
## [317] "logg_esphs"                       "logg_esphs_uncertainty"          
## [319] "vsini_esphs"                      "vsini_esphs_uncertainty"         
## [321] "flags_esphs"                      "spectraltype_esphs"              
## [323] "activityindex_espcs"              "activityindex_espcs_uncertainty" 
## [325] "activityindex_espcs_input"        "teff_espucd"                     
## [327] "teff_espucd_uncertainty"          "flags_espucd"                    
## [329] "radius_flame"                     "radius_flame_lower"              
## [331] "radius_flame_upper"               "lum_flame"                       
## [333] "lum_flame_lower"                  "lum_flame_upper"                 
## [335] "mass_flame"                       "mass_flame_lower"                
## [337] "mass_flame_upper"                 "age_flame"                       
## [339] "age_flame_lower"                  "age_flame_upper"                 
## [341] "flags_flame"                      "evolstage_flame"                 
## [343] "gravredshift_flame"               "gravredshift_flame_lower"        
## [345] "gravredshift_flame_upper"         "bc_flame"                        
## [347] "mh_msc"                           "mh_msc_upper"                    
## [349] "mh_msc_lower"                     "azero_msc"                       
## [351] "azero_msc_upper"                  "azero_msc_lower"                 
## [353] "distance_msc"                     "distance_msc_upper"              
## [355] "distance_msc_lower"               "teff_msc1"                       
## [357] "teff_msc1_upper"                  "teff_msc1_lower"                 
## [359] "teff_msc2"                        "teff_msc2_upper"                 
## [361] "teff_msc2_lower"                  "logg_msc1"                       
## [363] "logg_msc1_upper"                  "logg_msc1_lower"                 
## [365] "logg_msc2"                        "logg_msc2_upper"                 
## [367] "logg_msc2_lower"                  "ag_msc"                          
## [369] "ag_msc_upper"                     "ag_msc_lower"                    
## [371] "logposterior_msc"                 "mcmcaccept_msc"                  
## [373] "mcmcdrift_msc"                    "flags_msc"                       
## [375] "neuron_oa_id"                     "neuron_oa_dist"                  
## [377] "neuron_oa_dist_percentile_rank"   "flags_oa"                        
## [379] "source_id.2"                      "r_med_geo"                       
## [381] "r_lo_geo"                         "r_hi_geo"                        
## [383] "r_med_photogeo"                   "r_lo_photogeo"                   
## [385] "r_hi_photogeo"                    "flag"                            
## [387] "sample"                           "v_tan"                           
## [389] "Gmag_abs"                         "RPmag_abs"                       
## [391] "BPmag_abs"
# print mean of mh_msc for the two samples
mean(df_general$mh_msc, na.rm = T)
## [1] -0.0617
mean(df_high$mh_msc, na.rm = T)
## [1] -0.178
# print number of non-NA values in mh_msc for the two samples
sum(!is.na(df_general$mh_msc))
## [1] 4451
sum(!is.na(df_high$mh_msc))
## [1] 1267
# compare differences in mh_msc between the two samples
t.test(df_general$mh_msc, df_high$mh_msc, var.equal = F)
## 
##  Welch Two Sample t-test
## 
## data:  df_general$mh_msc and df_high$mh_msc
## t = 12, df = 1610, p-value <2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.0976 0.1359
## sample estimates:
## mean of x mean of y 
##   -0.0617   -0.1785
t.test(df_general$age_flame, df_high$age_flame, var.equal = F)
## 
##  Welch Two Sample t-test
## 
## data:  df_general$age_flame and df_high$age_flame
## t = 10, df = 233, p-value <2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.94 2.90
## sample estimates:
## mean of x mean of y 
##      6.50      4.07
# make histogram of mh_msc for the two samples, normalize to  frequency
hist <- ggplot(df, aes(x=mh_msc, fill=sample)) + 
  geom_histogram(aes(y=..density..), position="identity", alpha=0.5, bins=50) + 
  scale_fill_manual(values=c("red", "blue"), name='Sample',) + 
  theme_bw() + 
  labs(x="MH_MSC", y="Frequency")

ggplotly(hist)
# save plotly figure
p <- ggplotly(hist)
htmlwidgets::saveWidget(p, "hist.html")
# also save as png
ggsave("mh_hist.png", width=8, height=6, units="in", dpi=300)
t.test(df_general$r_med_geo, df_high$r_med_geo, var.equal = F)
## 
##  Welch Two Sample t-test
## 
## data:  df_general$r_med_geo and df_high$r_med_geo
## t = -33, df = 11598, p-value <2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -125 -111
## sample estimates:
## mean of x mean of y 
##       643       761
# histogram of distance
hist <- ggplot(df, aes(x=r_med_geo, fill=sample)) + 
  geom_histogram(aes(y=..density..), position="identity", alpha=0.5, bins=50) + 
  scale_fill_manual(values=c("red", "blue"), name='Sample',) + 
  theme_bw() + 
  labs(x="Distance (pc)", y="Frequency")

ggplotly(hist)
# save as png
ggsave("dist_hist.png", width=8, height=6, units="in", dpi=300)
# histogram of galactic latitude
hist <- ggplot(df, aes(x=b, fill=sample)) + 
  geom_histogram(aes(y=..density..), position="identity", alpha=0.5, bins=50) + 
  scale_fill_manual(values=c("red", "blue"), name='Sample',) + 
  theme_bw() + 
  labs(x="Galactic Latitude (b)", y="Frequency")

ggsave("b_hist.png", width=8, height=6, units="in", dpi=300)
# number of empty values in ebpminrp_gspphot
sum(is.na(df_general$ebpminrp_gspphot))
## [1] 5047
sum(is.na(df_high$ebpminrp_gspphot))
## [1] 3176
# histogram of E(BP-RP) extinction
hist <- ggplot(df, aes(x=ebpminrp_gspphot, fill=sample)) + 
  geom_histogram(aes(y=..density..), position="identity", alpha=0.5, bins=50) + 
  scale_fill_manual(values=c("red", "blue"), name='Sample',) + 
  theme_bw() + 
  labs(x="E(BP-RP)", y="Frequency")

ggsave("ebp-rp_hist.png", width=8, height=6, units="in", dpi=300)
# make Gaia CMD of high velocity and general sample stars using phot_g_mean_mag and bp_rp columns using plotly. overlay the two CMDs on the same plot.
cmd <- ggplot(df, aes(x=bp_rp, y=Gmag_abs, text=paste('designation: ', designation))) + 
  geom_point(aes(color=sample), alpha=0.25, ) + 
  scale_color_manual(values=c("red", "blue"), name='Sample') + 
  # invert y axis
  scale_y_reverse() +
  theme_bw() + 
  labs(x="BP-RP", y="G")

# ggplotly(cmd)

# save plotly figure
p <- ggplotly(cmd)
htmlwidgets::saveWidget(p, "cmd.html")
# also save as png
ggsave("cmd.png", width=8, height=6, units="in", dpi=300)
# correct color for extinction
df$bp_rp_corr <- df$bp_rp - df$ebpminrp_gspphot
# correct Gmag for extinction based on A_G = 2 * E(BP-RP)
df$Gmag_abs_corr <- df$Gmag_abs - 2 * df$ebpminrp_gspphot

cmd_corr <- ggplot(df, aes(x=bp_rp_corr, y=Gmag_abs_corr, text=paste('designation: ', designation))) + 
  geom_point(aes(color=sample), alpha=0.25, ) + 
  scale_color_manual(values=c("red", "blue"), name='Sample') + 
  # invert y axis
  scale_y_reverse() +
  theme_bw() + 
  labs(x="BP-RP Dereddened", y="G")

# ggplotly(cmd)

# save plotly figure
p <- ggplotly(cmd_corr)
htmlwidgets::saveWidget(p, "cmd_corr.html")
# also save as png
ggsave("cmd_corr.png", width=8, height=6, units="in", dpi=300)
# make color-color diagram with bp-gp and gp-rp
ccd <- ggplot(df, aes(x=bp_rp, y=bp_g)) + 
  geom_point(aes(color=sample), alpha=0.25) + 
  scale_color_manual(values=c("red", "blue"), labels=c("General Sample", "High Velocity Sample")) + 
  theme_bw() + 
  labs(x="BP-RP", y="RP-RP")

# ggplotly(ccd)

# save plotly figure
p <- ggplotly(ccd)
htmlwidgets::saveWidget(p, "ccd.html")
# also save as png
ggsave("ccd.png", width=8, height=6, units="in", dpi=300)
# select stars with Gmag_abs between 4 and 9, and bp_rp between 0.5 and 2.0

df_ms <- df %>% filter(Gmag_abs > 4 & Gmag_abs < 9 & bp_rp > 0.5 & bp_rp < 2.0)

# linear regression of bp_rp vs Gmag_abs on df_ms
fit <- lm(Gmag_abs ~ bp_rp, data=df_ms)
summary(fit)
## 
## Call:
## lm(formula = Gmag_abs ~ bp_rp, data = df_ms)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.776 -0.417 -0.056  0.434  4.190 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.3462     0.0465    50.4   <2e-16 ***
## bp_rp         3.1982     0.0324    98.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.733 on 4033 degrees of freedom
## Multiple R-squared:  0.708,  Adjusted R-squared:  0.707 
## F-statistic: 9.76e+03 on 1 and 4033 DF,  p-value: <2e-16
# test the effect of sample on the regression
fit2 <- lm(Gmag_abs ~ bp_rp + sample, data=df_ms)
summary(fit2)
## 
## Call:
## lm(formula = Gmag_abs ~ bp_rp + sample, data = df_ms)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.529 -0.287  0.060  0.313  4.436 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  2.1004     0.0398    52.8   <2e-16 ***
## bp_rp                        3.1975     0.0274   116.8   <2e-16 ***
## sampleHigh Velocity Sample   0.8696     0.0216    40.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.62 on 4032 degrees of freedom
## Multiple R-squared:  0.791,  Adjusted R-squared:  0.791 
## F-statistic: 7.64e+03 on 2 and 4032 DF,  p-value: <2e-16
# test the effect of the interaction between sample and bp_rp on the regression
fit3 <- lm(Gmag_abs ~ bp_rp * sample, data=df_ms)
summary(fit3)
## 
## Call:
## lm(formula = Gmag_abs ~ bp_rp * sample, data = df_ms)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.541 -0.290  0.059  0.311  4.406 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        2.1561     0.0465   46.33   <2e-16 ***
## bp_rp                              3.1575     0.0324   97.50   <2e-16 ***
## sampleHigh Velocity Sample         0.6754     0.0869    7.77    1e-14 ***
## bp_rp:sampleHigh Velocity Sample   0.1394     0.0605    2.31    0.021 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.619 on 4031 degrees of freedom
## Multiple R-squared:  0.791,  Adjusted R-squared:  0.791 
## F-statistic: 5.1e+03 on 3 and 4031 DF,  p-value: <2e-16
# do anova test on fit2 vs fit, and fit3 vs fit2
anova(fit2, fit)
## Analysis of Variance Table
## 
## Model 1: Gmag_abs ~ bp_rp + sample
## Model 2: Gmag_abs ~ bp_rp
##   Res.Df  RSS Df Sum of Sq    F Pr(>F)    
## 1   4032 1549                             
## 2   4033 2169 -1      -620 1615 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit3, fit2)
## Analysis of Variance Table
## 
## Model 1: Gmag_abs ~ bp_rp * sample
## Model 2: Gmag_abs ~ bp_rp + sample
##   Res.Df  RSS Df Sum of Sq    F Pr(>F)  
## 1   4031 1547                           
## 2   4032 1549 -1     -2.04 5.32  0.021 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# plot the regression line for each sample on cmd
cmd_ms1 <- ggplot(df_ms, aes(x=bp_rp, y=Gmag_abs)) + 
  geom_point(aes(color=sample), alpha=0.25) + 
  scale_color_manual(values=c("red", "blue"), name='Sample',) + 
  theme_bw() + 
  labs(x="BP-RP", y="G") + 
  geom_abline(aes(intercept=fit3$coefficients[1], slope=fit3$coefficients[2]), color="blue")
  # invert y axis
  # scale_y_reverse()


# ggplotly(cmd_ms1)

# save plotly figure
p <- ggplotly(cmd_ms1)
htmlwidgets::saveWidget(p, "cmd_ms1.html")
# also save as png
ggsave("cmd_ms1.png", width=8, height=6, units="in", dpi=300)
# plot the regression line for each sample on cmd
cmd_ms2 <- ggplot(df_ms, aes(x=bp_rp, y=Gmag_abs)) + 
  geom_point(aes(color=sample), alpha=0.25) + 
  scale_color_manual(values=c("red", "blue"), name='Sample',) + 
  theme_bw() + 
  labs(x="BP-RP", y="G") + 
  geom_abline(aes(intercept=fit2$coefficients[1], slope=fit2$coefficients[2]), color="red") + 
  geom_abline(aes(intercept=fit2$coefficients[1]+fit2$coefficients[3], slope=fit2$coefficients[2]), color="blue") 
  # invert y axis
  # scale_y_reverse()


# ggplotly(cmd_ms2)

# save plotly figure
p <- ggplotly(cmd_ms2)
htmlwidgets::saveWidget(p, "cmd_ms2.html")
# also save as png
ggsave("cmd_ms2.png", width=8, height=6, units="in", dpi=300)
# plot the regression line for each sample on cmd
cmd_ms3 <- ggplot(df_ms, aes(x=bp_rp, y=Gmag_abs)) + 
  geom_point(aes(color=sample), alpha=0.25) + 
  scale_color_manual(values=c("red", "blue"), name='Sample',) + 
  theme_bw() + 
  labs(x="BP-RP", y="G") + 
  geom_abline(aes(intercept=fit3$coefficients[1], slope=fit3$coefficients[2]), color="red", ) +
  geom_abline(aes(intercept=fit3$coefficients[1]+fit3$coefficients[3], slope=fit3$coefficients[2] + fit3$coefficients[4]), color="blue")
  # invert y axis
  # scale_y_reverse()


# ggplotly(cmd_ms2)

# save plotly figure
p <- ggplotly(cmd_ms3)
htmlwidgets::saveWidget(p, "cmd_ms3.html")
# also save as png
ggsave("cmd_ms3.png", width=8, height=6, units="in", dpi=300)
# select sources with bp_rp < 0

df_blue <- df %>% filter(bp_rp < 0)

# view the data
View(df_blue)